import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import session_info
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GroupKFold, GroupShuffleSplit, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, cross_validate
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.model_selection import KFold
import pingouin as pg
from catboost import CatBoostRegressor, Pool
# import xgboost
import shap
# Notebook-wide setup: SHAP JS support, seaborn theme, session/package report.
shap.initjs()
sns.set_style("whitegrid")
# Pass write_req_file=True instead to also write the requirements file
# named by req_file_name.
session_info.show(
    write_req_file=False,
    req_file_name="corona_preppers-requirements.txt",
)
----- catboost 1.0.3 matplotlib 3.3.1 numpy 1.19.1 pandas 1.1.3 pingouin 0.5.0 seaborn 0.11.0 session_info 1.0.0 shap 0.39.0 sklearn 0.23.2 -----
PIL 8.0.0 appnope 0.1.0 backcall 0.2.0 beta_ufunc NA binom_ufunc NA brotli NA certifi 2021.10.08 cffi 1.14.3 chardet 3.0.4 cloudpickle 1.6.0 colorama 0.4.4 cycler 0.10.0 cython_runtime NA dateutil 2.8.1 decorator 4.4.2 idna 2.10 ipykernel 5.3.4 ipython_genutils 0.2.0 jedi 0.18.1 joblib 0.17.0 kiwisolver 1.2.0 littleutils NA llvmlite 0.34.0 mpl_toolkits NA nbinom_ufunc NA numba 0.51.2 outdated 0.2.1 pandas_flavor NA parso 0.8.0 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA prompt_toolkit 3.0.8 ptyprocess 0.6.0 pygments 2.7.1 pyparsing 2.4.7 pytz 2020.1 requests 2.24.0 scipy 1.7.1 six 1.15.0 slicer NA socks 1.7.1 statsmodels 0.12.0 storemagic NA tabulate 0.8.7 threadpoolctl 2.1.0 tornado 6.0.4 tqdm 4.50.2 traitlets 5.0.5 urllib3 1.25.11 wcwidth 0.2.5 xarray 0.16.1 zmq 19.0.2
----- IPython 7.18.1 jupyter_client 6.1.7 jupyter_core 4.6.3 jupyterlab 2.2.6 notebook 6.1.4 ----- Python 3.8.5 (default, Sep 4 2020, 02:22:02) [Clang 10.0.0 ] macOS-10.16-x86_64-i386-64bit ----- Session information updated at 2021-12-10 21:33
# Read the prepared survey export, then discard the "Unnamed: 0" column.
df = pd.read_csv("data/shield_gjames_21-09-20_prepped.csv")
df = df.drop(columns=["Unnamed: 0"])
df.head()
| id | sampling_weight | demographic_gender | demographic_age | demographic_4_areas | demographic_8_areas | demographic_higher_education | behaviour_indoors_nonhouseholders | behaviour_close_contact | behaviour_quarantined | ... | intention_public_transport_recoded | intention_indoor_meeting_recoded | intention_restaurant_recoded | intention_pa_recoded | intention_composite | behaviour_indoors_nonhouseholders_recoded | behaviour_unmasked_recoded | behavior_composite | behavior_composite_recoded | intention_behavior_composite | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2.060959 | 2 | 60+ | 2 | 7 | 0 | 2 | 5 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1 | 2 | 1.784139 | 2 | 40-49 | 1 | 1 | 1 | 3 | 3 | 2 | ... | 0 | 1 | 1 | 1 | 3 | 0.785714 | 0.214286 | 0.168367 | 0.841837 | 1.920918 |
| 2 | 3 | 1.204000 | 1 | 60+ | 1 | 2 | 1 | 4 | 4 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0.500000 | 0.214286 | 0.107143 | 0.535714 | 0.267857 |
| 3 | 4 | 2.232220 | 1 | 60+ | 2 | 6 | 0 | 4 | 3 | 2 | ... | 0 | 2 | 0 | 2 | 4 | 0.500000 | 0.500000 | 0.250000 | 1.250000 | 2.625000 |
| 4 | 5 | 1.627940 | 2 | 18-29 | 1 | 3 | 0 | 6 | 3 | 2 | ... | 0 | 2 | 0 | 0 | 2 | 0.000000 | 0.214286 | 0.000000 | 0.000000 | 1.000000 |
5 rows × 106 columns
# Switch: set to False to keep every column whose name contains "sdt".
drop_sdt = True
sdt_columns = df.filter(regex="sdt").columns.tolist()
if drop_sdt:
    df = df.drop(columns=sdt_columns)
df.shape
(2272, 87)
target = "intention_behavior_composite"
# Mirror the composite around 5 (0 <-> 10). Written as `10 - x` rather than
# `(x - 10) * -1`: the values are identical, but an original score of exactly
# 10 now becomes 0.0 instead of -0.0 (which leaked into describe()/plots).
df[target] = 10 - df[target]
# Candidate predictors: columns starting with automaticity/norms/risk/effective,
# plus any column containing "attitude" (that alternative is not anchored).
features_list = df.filter(regex="^automaticity|attitude|^norms|^risk|^effective").columns.tolist()
# Columns of the question-label workbook that are kept.
meta_columns = [
    'Original position',
    'Variable name',
    'Label',
    'Item english translation',
    'Label short',
    'Type',
    'New variable name',
    'variable name helper',
    'Of primary interest as a predictor (i.e. feature)?',
    'English lo-anchor',
    'English hi-anchor',
]
meta_df = pd.read_excel("metadata/questionLabels.xlsx")[meta_columns]

# Variables to look up: the predictor families plus behaviour/intention items.
meta_list = df.filter(regex="^automaticity|attitude|^norms|^risk|^effective|^behaviour|^intention").columns.tolist()

# Widen the column display so the full item wording is readable.
pd.set_option("display.max_colwidth", 350)
pd.set_option('display.expand_frame_repr', True)

is_listed = meta_df["New variable name"].isin(meta_list)
meta_df.loc[is_listed, ["Item english translation", "New variable name"]]
| Item english translation | New variable name | |
|---|---|---|
| 12 | How often in the last 7 days have you been indoors with people outside your household so that it is not related to obligations? For example, meeting friends, visiting hobbies, non-essential shopping, or other activities that are not required for your work or other duties. | behaviour_indoors_nonhouseholders |
| 13 | In the last 7 days, have you been in close contact with people outside your household? Direct contact means spending more than one minute less than two meters away from another person or touching (e.g., shaking hands) outdoors or indoors. | behaviour_close_contact |
| 14 | Are you currently in quarantine or isolation due to an official instruction or order? (For example, because you are waiting for a corona test, have returned from abroad or been exposed to a coronavirus) | behaviour_quarantined |
| 15 | How often in the last 7 days were you in your free time without a mask indoors with people you don’t live with? | behaviour_unmasked |
| 24 | If in the next 7 days you go to visit the following indoor spaces and there are people outside your household, Are you going to wear a mask? Grocery store or other store | intention_store |
| 25 | If in the next 7 days you go to visit the following indoor spaces and there are people outside your household, Are you going to wear a mask? Bus, train or other means of public transport | intention_public_transport |
| 26 | If in the next 7 days you go to visit the following indoor spaces and there are people outside your household, Are you going to wear a mask? Meeting people outside your household indoors | intention_indoor_meeting |
| 27 | If in the next 7 days you go to visit the following indoor spaces and there are people outside your household, Are you going to wear a mask? Cafe, restaurant or bar indoors | intention_restaurant |
| 28 | If in the next 7 days you go to visit the following indoor spaces and there are people outside your household, Are you going to wear a mask? Indoor exercise | intention_pa |
| 29 | Taking a mask with you to a store or public transport, for example, has already become automatic for some and is done without thinking. For others, taking a mask with them is not automatic at all, but requires conscious thinking and effort. | automaticity_carry_mask |
| 30 | Putting on a mask, for example in a shop or on public transport, has already become automatic for some and it happens without thinking. For others, putting on a mask is not automatic at all, but requires conscious thinking and effort. | automaticity_put_on_mask |
| 32 | What consequences do you think it has if you use a face mask in your free time? If or when I use a face mask… | inst_attitude_protects_self |
| 33 | What consequences do you think it has if you use a face mask in your free time? If or when I use a face mask… | inst_attitude_protects_others |
| 34 | What consequences do you think it has if you use a face mask in your free time? If or when I use a face mask… | inst_attitude_sense_of_community |
| 35 | What consequences do you think it has if you use a face mask in your free time? If or when I use a face mask… | inst_attitude_enough_oxygen |
| 36 | What consequences do you think it has if you use a face mask in your free time? If or when I use a face mask… | inst_attitude_no_needless_waste |
| 37 | Who thinks you should use a face mask and who thinks not? In the following questions, by using a face mask, we mean holding a cloth or disposable face mask, surgical mask, or respirator on the face so that it covers the nose and mouth. The questions concern leisure time. My family and friends think I should .. | norms_family_friends |
| 38 | People at risk think I should .. | norms_risk_groups |
| 39 | The authorities think I should .. | norms_officials |
| 40 | In the indoors spaces I visit, people on the site think I should… | norms_people_present_indoors |
| 41 | When I use a face mask, I feel or would feel ... | aff_attitude_comfortable |
| 42 | When I use a face mask, I feel or would feel ... | aff_attitude_calm |
| 43 | When I use a face mask, I feel or would feel ... | aff_attitude_safe |
| 44 | When I use a face mask, I feel or would feel ... | aff_attitude_responsible |
| 45 | When I use a face mask, I feel or would feel ... | aff_attitude_difficult_breathing |
| 61 | If two unvaccinated people from different households meet indoors, what means do you think would be effective in preventing coronavirus infection? Hand washing and use of gloves | effective_means_handwashing |
| 62 | Using a face mask | effective_means_masks |
| 63 | Keeping a safety distance (2 meters) | effective_means_distance |
| 64 | Ventilation | effective_means_ventilation |
| 65 | How likely do you think you will get a coronavirus infection in your free time in the next month? | risk_likely_contagion |
| 66 | How likely do you think you would get a coronavirus infection in your free time in the next month if you did nothing to protect yourself from it? | risk_contagion_absent_protection |
| 67 | If you got a coronavirus infection, how serious a threat would you rate it to your health? | risk_severity |
| 68 | Spread of coronavirus… | risk_fear_spread |
| 69 | The fact that I would get infected myself .. | risk_fear_contagion_self |
| 70 | That my loved one would get infected... | risk_fear_contagion_others |
| 71 | Consequences of measures taken to prevent the spread of the coronavirus... | risk_fear_restrictions |
# Narrow the column display again after showing the full question texts.
pd.options.display.max_colwidth = 100
Check the number of samples in the target
# Long-format copy of the target column, shared by both overlaid plots.
target_long = df[[target]].melt()
_ = sns.violinplot(data=target_long, x="variable", y="value")
_ = sns.stripplot(data=target_long,
                  x="variable",
                  y="value",
                  edgecolor='white',
                  linewidth=0.5)
# Respondent counts per gender x age-group cell.
pd.crosstab(df["demographic_gender"], df["demographic_age"])
| demographic_age | 18-29 | 30-39 | 40-49 | 50-59 | 60+ |
|---|---|---|---|---|---|
| demographic_gender | |||||
| 1 | 114 | 169 | 187 | 168 | 337 |
| 2 | 281 | 185 | 229 | 211 | 391 |
# Despite the name, target_df is the target column as a Series.
target_df = df[target]
# One-row summary table (statistics as columns).
df[[target]].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| intention_behavior_composite | 2272.0 | 8.582428 | 1.524704 | -0.0 | 8.017857 | 8.964286 | 9.5 | 10.0 |
# Frequency of every distinct target value.
_ = plt.figure(figsize=(20, 5))
_ = sns.countplot(x=target_df)
_ = plt.xticks(rotation=90)

# Keep the two demographic columns, the features and the target; the features
# are cast to pandas categoricals.
keep_columns = ["demographic_age", "demographic_higher_education", *features_list, target]
df = df[keep_columns].astype({name: "category" for name in features_list})
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2272 entries, 0 to 2271 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 demographic_age 2272 non-null object 1 demographic_higher_education 2272 non-null int64 2 automaticity_carry_mask 2272 non-null category 3 automaticity_put_on_mask 2272 non-null category 4 inst_attitude_protects_self 2272 non-null category 5 inst_attitude_protects_others 2272 non-null category 6 inst_attitude_sense_of_community 2272 non-null category 7 inst_attitude_enough_oxygen 2272 non-null category 8 inst_attitude_no_needless_waste 2272 non-null category 9 norms_family_friends 2272 non-null category 10 norms_risk_groups 2272 non-null category 11 norms_officials 2272 non-null category 12 norms_people_present_indoors 2272 non-null category 13 aff_attitude_comfortable 2272 non-null category 14 aff_attitude_calm 2272 non-null category 15 aff_attitude_safe 2272 non-null category 16 aff_attitude_responsible 2272 non-null category 17 aff_attitude_difficult_breathing 2272 non-null category 18 effective_means_handwashing 2272 non-null category 19 effective_means_masks 2272 non-null category 20 effective_means_distance 2272 non-null category 21 effective_means_ventilation 2272 non-null category 22 risk_likely_contagion 2272 non-null category 23 risk_contagion_absent_protection 2272 non-null category 24 risk_severity 2272 non-null category 25 risk_fear_spread 2272 non-null category 26 risk_fear_contagion_self 2272 non-null category 27 risk_fear_contagion_others 2272 non-null category 28 risk_fear_restrictions 2272 non-null category 29 intention_behavior_composite 2272 non-null float64 dtypes: category(27), float64(1), int64(1), object(1) memory usage: 123.2+ KB
grouping_var = target
# Five most frequent target values, total row count, and how many rows those
# five values cover (display() returns None, hence the leading None in the
# echoed tuple).
top_counts = df[grouping_var].value_counts().head()
display(top_counts.to_frame()), df.shape[0], top_counts.sum()
| intention_behavior_composite | |
|---|---|
| 10.000000 | 424 |
| 9.500000 | 228 |
| 9.000000 | 187 |
| 8.885204 | 155 |
| 9.385204 | 112 |
(None, 2272, 1106)
def naive_catboost_forest_summary(df: pd.DataFrame,
                                  grouping_var: str,
                                  column_list: list,
                                  plot_title: str
                                  ):
    """Fit a CatBoost regressor on ``df`` and plot/return a summary of the fit.

    The model is fitted and evaluated on the same rows, so the residual plot
    and the RMSE in its title describe in-sample fit only.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to model; must contain ``grouping_var`` and all of
        ``column_list``.
    grouping_var : str
        Name of the target column.
    column_list : list
        Feature column names. Every one is passed to CatBoost as a
        categorical feature; the columns must be pandas ``category`` dtype
        (``.cat.codes`` is used) and castable to ``int`` (for the SHAP plot).
    plot_title : str
        Text appended to the plot titles.

    Returns
    -------
    tuple
        ``(feature_plot, importance_figure, shap_figure, rmse_plot,
        feature_importance_df, shap_actual_df)``: the feature-distribution
        figure, the feature-importance figure, the SHAP summary figure, the
        object returned by ``plt.stem`` for the residual plot, a frame with
        columns ``feature`` / ``feature_importance`` sorted descending, and a
        long frame with columns ``variable`` / ``actual_value`` /
        ``shap_value``.
    """
    y = df[grouping_var]
    X = df[column_list]

    # --- distribution of the category codes of each feature ---------------
    feature_plot, ax = plt.subplots(figsize=(10, 7))
    _ = sns.boxplot(ax=ax,
                    data=X.apply(lambda x: x.cat.codes),
                    orient="v",
                    )
    _ = plt.title(f'Feature Distributions {plot_title}')
    _ = plt.setp(ax.get_xticklabels(), rotation=90)
    _ = plt.grid()
    _ = plt.tight_layout()
    _ = plt.show()

    # --- model -------------------------------------------------------------
    model = CatBoostRegressor(iterations=500,
                              depth=None,
                              learning_rate=1,
                              loss_function='RMSE',
                              verbose=False)
    _ = model.fit(X, y, cat_features=column_list)

    # --- model feature importances, largest first -------------------------
    feature_importance = pd.Series(dict(zip(column_list, model.feature_importances_.round(2))))
    feature_importance_df = (pd.DataFrame(feature_importance.sort_values(ascending=False))
                             .reset_index()
                             .rename(columns={"index": "feature", 0: "feature_importance"}))
    _ = plt.figure(figsize=(7, 7))
    gini_plot = sns.barplot(data=feature_importance_df,
                            x="feature_importance",
                            y="feature")
    _ = plt.title(f'Feature Importance {plot_title}')
    _ = plt.show()

    # --- SHAP values -------------------------------------------------------
    shap_values = model.get_feature_importance(Pool(X, label=y, cat_features=X.columns.tolist()),
                                               type="ShapValues")
    # The last column is dropped so one column per feature remains.
    shap_values = shap_values[:, :-1]
    _ = shap.summary_plot(shap_values,
                          X.astype(int),
                          feature_names=X.columns,
                          max_display=X.shape[1],
                          show=False,
                          title=plot_title)
    shap_plot = plt.gca()

    # Long table pairing each observed feature value with its SHAP value;
    # both melts stack the columns in column_list order, so rows line up.
    tmp_actual = X.melt(value_name='actual_value')
    tmp_shap = pd.DataFrame(shap_values, columns=column_list).melt(value_name='shap_value')
    shap_actual_df = pd.concat([tmp_actual, tmp_shap[["shap_value"]]], axis=1)

    # --- in-sample residuals ----------------------------------------------
    y_pred = model.predict(X)
    df_test = pd.DataFrame({"y_pred": y_pred, grouping_var: y})
    residuals = df_test['y_pred'] - df_test[grouping_var]
    # Computed once and reused by the title and both reference lines.
    rmse = np.sqrt(mean_squared_error(df_test['y_pred'], df_test[grouping_var]))
    user_ids_first = df_test.index[0]
    user_ids_last = df_test.index[-1]

    _ = plt.figure(figsize=(30, 8))
    _ = plt.title(f"Catboost Regressor(fitted set) | RMSE = {round(rmse, 4)} | bias Error = {round(np.mean(residuals), 4)} | {plot_title}")
    # use_line_collection is intentionally not passed: it is the default on
    # the matplotlib recorded for this notebook (3.3.1) and the keyword was
    # later deprecated and removed.
    rmse_plot = plt.stem(df_test.index, residuals, linefmt='grey', markerfmt='D')
    _ = plt.hlines(y=round(rmse, 2), colors='b', linestyles='-.', label='+ RMSE',
                   xmin=user_ids_first,
                   xmax=user_ids_last
                   )
    _ = plt.hlines(y=round(-rmse, 2), colors='b', linestyles='-.', label='- RMSE',
                   xmin=user_ids_first,
                   xmax=user_ids_last
                   )
    _ = plt.xticks(rotation=90, ticks=df_test.index)
    _ = plt.ylabel(f"'Error = y_predicted - {grouping_var}'")
    _ = plt.legend()
    _ = plt.show()

    return feature_plot, gini_plot.get_figure(), shap_plot.get_figure(), rmse_plot, feature_importance_df, shap_actual_df
# %%capture
def _summarise_subset(subset, title):
    """Run naive_catboost_forest_summary on `subset` with the shared target and features."""
    return naive_catboost_forest_summary(df=subset,
                                         grouping_var=grouping_var,
                                         column_list=features_list,
                                         plot_title=title)


# Age groups
(feature_plot_0, gini_plot_0, shap_plot_0,
 rmse_plot_0, feature_importance_df_0, shap_values_0) = _summarise_subset(
    df[df["demographic_age"].isin(['18-29', '30-39'])], "18 - 39")

(feature_plot_1, gini_plot_1, shap_plot_1,
 rmse_plot_1, feature_importance_df_1, shap_values_1) = _summarise_subset(
    df[df["demographic_age"].isin(['40-49', '50-59'])], "40 - 59")

(feature_plot_2, gini_plot_2, shap_plot_2,
 rmse_plot_2, feature_importance_df_2, shap_values_2) = _summarise_subset(
    df[df["demographic_age"].isin(['60+'])], "60+")

# Whole sample
(feature_plot_3, gini_plot_3, shap_plot_3,
 rmse_plot_3, feature_importance_df_3, shap_values_3) = _summarise_subset(
    df, "All")

# Whole sample without the rows whose target equals 10
(feature_plot_4, gini_plot_4, shap_plot_4,
 rmse_plot_4, feature_importance_df_4, shap_values_4) = _summarise_subset(
    df[df[grouping_var] != 10], "All - No 10's in target")

# Education split (demographic_higher_education == 0 / == 1)
(feature_plot_5, gini_plot_5, shap_plot_5,
 rmse_plot_5, feature_importance_df_5, shap_values_5) = _summarise_subset(
    df[df["demographic_higher_education"] == 0], "Lower Education")

(feature_plot_6, gini_plot_6, shap_plot_6,
 rmse_plot_6, feature_importance_df_6, shap_values_6) = _summarise_subset(
    df[df["demographic_higher_education"] == 1], "Higher Education")
# One feature-importance bar chart per age group plus the whole sample,
# side by side on a shared x axis.
fi_dfs_list = [feature_importance_df_0, feature_importance_df_1, feature_importance_df_2, feature_importance_df_3]
fi_titles_list = ["18 - 39", "40 - 59", "60+", "All"]
fig, axs = plt.subplots(nrows=1,
                        ncols=4,
                        sharex=True,
                        sharey=False,
                        figsize=(30, 7),
                        gridspec_kw={'wspace': 0.75})
for i, (fi_df, panel_title) in enumerate(zip(fi_dfs_list, fi_titles_list)):
    _ = sns.barplot(data=fi_df,
                    x="feature_importance",
                    y="feature",
                    ax=axs[i],
                    palette="rocket"
                    )
    _ = axs[i].set_title(panel_title)
# _ = plt.show()
fi_dfs_list = [feature_importance_df_0, feature_importance_df_1, feature_importance_df_2, feature_importance_df_3]
fi_titles_list = ["18 - 39", "40 - 59", "60+", "All"]
# Tag each importance table with its group label; this writes the column into
# the original feature_importance_df_* frames as well.
for i, group_label in enumerate(fi_titles_list):
    fi_dfs_list[i]["age_group"] = group_label

# The first five rows (highest importances) of every group, in one chart.
top_features_by_group = pd.concat(fi_dfs_list, axis=0).groupby("age_group").head(5)
_ = plt.figure(figsize=(7, 5))
_ = sns.barplot(
    data=top_features_by_group,
    x="feature_importance",
    y="feature",
    hue="age_group",
    palette="rocket",
    dodge=True
)

# Side-by-side table of the per-group rankings.
fis_df = pd.concat(fi_dfs_list, axis=1)
fis_df.head(5)
| feature | feature_importance | age_group | feature | feature_importance | age_group | feature | feature_importance | age_group | feature | feature_importance | age_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | automaticity_put_on_mask | 12.42 | 18 - 39 | automaticity_put_on_mask | 19.44 | 40 - 59 | automaticity_carry_mask | 10.24 | 60+ | automaticity_carry_mask | 10.10 | All |
| 1 | norms_people_present_indoors | 7.77 | 18 - 39 | inst_attitude_sense_of_community | 11.25 | 40 - 59 | norms_people_present_indoors | 6.76 | 60+ | risk_severity | 7.71 | All |
| 2 | aff_attitude_safe | 6.97 | 18 - 39 | inst_attitude_protects_self | 5.38 | 40 - 59 | aff_attitude_comfortable | 6.18 | 60+ | effective_means_masks | 6.89 | All |
| 3 | risk_fear_spread | 6.38 | 18 - 39 | effective_means_distance | 4.80 | 40 - 59 | risk_severity | 5.80 | 60+ | norms_people_present_indoors | 5.15 | All |
| 4 | inst_attitude_no_needless_waste | 6.32 | 18 - 39 | risk_fear_restrictions | 4.63 | 40 - 59 | aff_attitude_safe | 5.61 | 60+ | effective_means_ventilation | 4.49 | All |
# SHAP value strip plots, one panel per age group plus the whole sample.
fig, axs = plt.subplots(nrows=1,
                        ncols=4,
                        sharex=True,
                        sharey=False,
                        figsize=(30, 7),
                        gridspec_kw={'wspace': 0.75})
shap_dfs_list = [shap_values_0, shap_values_1, shap_values_2, shap_values_3]
shap_titles_list = ["18 - 39", "40 - 59", "60+", "All"]
for i, (shap_df, panel_title) in enumerate(zip(shap_dfs_list, shap_titles_list)):
    # Order features by the variance of their SHAP values, largest first.
    # Only `shap_value` is aggregated: a frame-wide .var() depends on pandas
    # silently dropping the non-numeric `actual_value` column, which newer
    # pandas versions refuse to do.
    var_order = (shap_df
                 .groupby("variable")["shap_value"]
                 .var()
                 .sort_values(ascending=False)
                 .index
                 .tolist())
    _ = sns.stripplot(data=shap_df,
                      x="shap_value",
                      y="variable",
                      hue="actual_value",
                      order=var_order,
                      ax=axs[i],
                      )
    _ = axs[i].set_title(panel_title)
# Repeated k-fold cross-validation of the CatBoost regressor:
# 10 folds x 10 repeats = 100 train/test splits.
tmp_df = df.reset_index(drop=True)
# NOTE(review): the [2:] slice leaves out the first two entries of
# features_list. Judging by the df.info() listing these look like the two
# automaticity_* columns -- confirm that excluding them here is intended.
X = tmp_df[features_list[2:]]
y = tmp_df[grouping_var]

accuracies_list = list()   # test-set RMSE of every split
pred_frames = []           # per-split observed vs. predicted values
corr_frames = []           # per-split Spearman correlation rows

kfold = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
model = CatBoostRegressor(iterations=500,
                          depth=None,
                          learning_rate=1,
                          loss_function='RMSE',
                          verbose=False)
cat_feature_names = X.columns.tolist()

# "fold_<k>" numbers the splits consecutively across all repeats (1..100).
for fold_number, (train_ix, test_ix) in enumerate(kfold.split(X), start=1):
    # select rows
    train_X, test_X = X.loc[train_ix, :], X.loc[test_ix, :]
    train_y, test_y = y.loc[train_ix], y.loc[test_ix]

    _ = model.fit(X=train_X,
                  y=train_y,
                  cat_features=cat_feature_names)
    pred_y = model.predict(test_X)
    accuracies_list.append(np.sqrt(mean_squared_error(test_y, pred_y)))

    pred_test_df = pd.DataFrame({grouping_var: test_y,
                                 "predict": pred_y,
                                 "fold_number": f"fold_{fold_number}"})
    pred_frames.append(pred_test_df)

    corr_df = pg.corr(x=pred_test_df[grouping_var],
                      y=pred_test_df["predict"],
                      alternative='two-sided',
                      method='spearman',
                      )
    corr_frames.append(corr_df.assign(fold_number=f"fold_{fold_number}"))

# Concatenate once at the end instead of re-copying a growing frame per split.
all_pred_test_df = pd.concat(pred_frames)
all_cors_df = pd.concat(corr_frames)
# Spread of the per-split test RMSE values from the cross-validation.
_ = plt.figure(figsize=(3, 5))
_ = sns.boxplot(y=accuracies_list)
_ = sns.swarmplot(y=accuracies_list, edgecolor="white", linewidth=1)
_ = plt.title("RMSE Cat Boost\nRegressor kfold cross validation")
# Summary statistics as a single row.
pd.Series(accuracies_list).describe().to_frame().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 0 | 100.0 | 1.460889 | 0.107116 | 1.172973 | 1.387053 | 1.45378 | 1.542548 | 1.713833 |
# Observed vs. predicted target for every cross-validation split; both plots
# take the same arguments.
pred_plot_kwargs = dict(data=all_pred_test_df,
                        x=grouping_var,
                        y="predict",
                        hue="fold_number",
                        legend=False)
_ = sns.lmplot(**pred_plot_kwargs)
# `ax` keeps whatever sns.jointplot returns.
ax = sns.jointplot(**pred_plot_kwargs)
# Summary of the per-split correlation results.
all_cors_df.describe()
| n | r | p-val | power | |
|---|---|---|---|---|
| count | 100.000000 | 100.000000 | 1.000000e+02 | 100.000000 |
| mean | 227.200000 | 0.337624 | 1.474281e-04 | 0.989853 |
| std | 0.402015 | 0.063481 | 8.058543e-04 | 0.031872 |
| min | 227.000000 | 0.177255 | 1.944096e-15 | 0.766244 |
| 25% | 227.000000 | 0.293919 | 4.683890e-09 | 0.995085 |
| 50% | 227.000000 | 0.338370 | 1.743040e-07 | 0.999554 |
| 75% | 227.000000 | 0.376328 | 6.681203e-06 | 0.999965 |
| max | 228.000000 | 0.495058 | 7.426213e-03 | 1.000000 |
# Per-split correlation coefficient and p-value, with a dashed reference
# line at 0.05.
cors_long = all_cors_df.melt(value_vars=["r", "p-val"])
_ = sns.boxplot(data=cors_long, x="variable", y="value")
_ = plt.axhline(y=0.05, c="grey", ls="--")
# Mean absolute SHAP value per feature for the whole-sample model
# (shap_values_3), largest first. Only `shap_value` is aggregated, so the
# result does not depend on pandas silently dropping the non-numeric
# `actual_value` column during .mean().
mean_abs_shap = (shap_values_3
                 .assign(shap_value=lambda d: d["shap_value"].abs())
                 .groupby("variable")[["shap_value"]]
                 .mean()
                 .sort_values(by="shap_value", ascending=False)
                 )
# Question metadata restricted to the model features.
feature_meta = meta_df.loc[meta_df["New variable name"].isin(features_list)]

# (variable name, item text) tuples in descending mean |SHAP| order.
sort_shap_list = (pd.merge(mean_abs_shap,
                           feature_meta[["Item english translation", "New variable name"]],
                           left_index=True,
                           right_on="New variable name")
                  .set_index(["New variable name", "Item english translation"])
                  .index
                  .tolist()
                  )
# Same ordering, with the low/high scale anchors appended to each tuple.
sort_shap_long_list = (pd.merge(mean_abs_shap,
                                feature_meta[["Item english translation", "New variable name", 'English lo-anchor', 'English hi-anchor']],
                                left_index=True,
                                right_on="New variable name")
                       .set_index(["New variable name", "Item english translation", 'English lo-anchor', 'English hi-anchor'])
                       .index
                       .tolist()
                       )
pd.Series([x[1] for x in sort_shap_list])
0 If you got a coronavirus infection, how serious a threat would you rate it to your health? 1 In the indoors spaces I visit, people on the site think I should… 2 Taking a mask with you to a store or public transport, for example, has already become automatic... 3 Using a face mask 4 When I use a face mask, I feel or would feel ... 5 Consequences of measures taken to prevent the spread of the coronavirus... 6 How likely do you think you would get a coronavirus infection in your free time in the next mont... 7 What consequences do you think it has if you use a face mask in your free time? If or when I use... 8 What consequences do you think it has if you use a face mask in your free time? If or when I use... 9 When I use a face mask, I feel or would feel ... 10 Ventilation 11 When I use a face mask, I feel or would feel ... 12 The fact that I would get infected myself .. 13 If two unvaccinated people from different households meet indoors, what means do you think would... 14 That my loved one would get infected... 15 What consequences do you think it has if you use a face mask in your free time? If or when I use... 16 Putting on a mask, for example in a shop or on public transport, has already become automatic fo... 17 What consequences do you think it has if you use a face mask in your free time? If or when I use... 18 Who thinks you should use a face mask and who thinks not? In the following questions, by using a... 19 People at risk think I should .. 20 What consequences do you think it has if you use a face mask in your free time? If or when I use... 21 Spread of coronavirus… 22 When I use a face mask, I feel or would feel ... 23 The authorities think I should .. 24 Keeping a safety distance (2 meters) 25 When I use a face mask, I feel or would feel ... 26 How likely do you think you will get a coronavirus infection in your free time in the next month? dtype: object
def naive_catboost_shap(df: pd.DataFrame,
                        grouping_var: str,
                        column_list: list,
                        plot_title: str,
                        max_display: int
                        ):
    """Fit a CatBoost regressor on ``df`` and return its SHAP summary figure.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding the target and the feature columns.
    grouping_var : str
        Name of the target column.
    column_list : list
        Feature column names; all are treated as categorical by CatBoost and
        are cast to ``int`` for the plot.
    plot_title : str
        Forwarded to ``shap.summary_plot`` as ``title``.
    max_display : int
        Forwarded to ``shap.summary_plot`` as ``max_display``.

    Returns
    -------
    The figure that owns the current axes after the summary plot is drawn.
    """
    features = df[column_list]
    target_values = df[grouping_var]

    regressor = CatBoostRegressor(iterations=500,
                                  depth=None,
                                  learning_rate=1,
                                  loss_function='RMSE',
                                  verbose=False)
    regressor.fit(features, target_values, cat_features=column_list)

    pool = Pool(features, label=target_values, cat_features=features.columns.tolist())
    # Drop the last column so one SHAP column per feature remains.
    per_feature_shap = regressor.get_feature_importance(pool, type="ShapValues")[:, :-1]

    shap.summary_plot(per_feature_shap,
                      features.astype(int),
                      feature_names=features.columns,
                      max_display=max_display,
                      show=False,
                      title=plot_title)
    return plt.gca().get_figure()
# SHAP summary plot limited to the 10 most important features, relabelled
# with the full question wording and scale anchors.
display_length = 10
short_shap_plot_all = naive_catboost_shap(df=df,
                                          grouping_var=grouping_var,
                                          column_list=features_list,
                                          plot_title="All",
                                          max_display=display_length)
# Labels in descending importance order (most important first).
new_axis_list = pd.Series([f"{x[1]}: [{x[2]} - {x[3]}]" for x in sort_shap_long_list[:display_length]]).str.wrap(61).tolist()
# set_yticklabels assigns labels from the bottom tick upwards, while
# shap.summary_plot draws the most important feature at the TOP. The list is
# therefore reversed so each row gets its own question text (previously the
# top row received the label of the least important displayed feature).
# This still assumes sort_shap_long_list ranks features like the plot does.
_ = short_shap_plot_all.gca().set_yticklabels(new_axis_list[::-1], fontsize=11)
short_shap_plot_all.set_figheight(10)
short_shap_plot_all.set_figwidth(12)
short_shap_plot_all
# SHAP summary plot showing every feature, relabelled with the full question
# wording and scale anchors.
# The display limit is the number of features (it was the row count of df,
# which only worked because larger values are clamped downstream).
display_length = len(features_list)
short_shap_plot_all = naive_catboost_shap(df=df,
                                          grouping_var=grouping_var,
                                          column_list=features_list,
                                          plot_title="All",
                                          max_display=display_length)
# Labels in descending importance order (most important first).
new_axis_list = pd.Series([f"{x[1]}: [{x[2]} - {x[3]}]" for x in sort_shap_long_list[:display_length]]).str.wrap(61).tolist()
# set_yticklabels assigns labels from the bottom tick upwards, while
# shap.summary_plot draws the most important feature at the TOP. The list is
# therefore reversed so each row gets its own question text.
# This still assumes sort_shap_long_list ranks features like the plot does.
_ = short_shap_plot_all.gca().set_yticklabels(new_axis_list[::-1], fontsize=11)
short_shap_plot_all.set_figheight(35)
short_shap_plot_all.set_figwidth(12)
short_shap_plot_all